In [1]:
"""
*1. import the image with a red rectangle to get the object  
*2. import other png images to use that as a background
*3. Crop the region of interest from the img with the target object
*4. Write a dataloader that pastes the object of interest on a random background image
*5. Add very many transformations to this dataloader - COULD ADD MANY MORE
*6. Write up a neural net class for the detection 
*7. Train the neural net and see how the training goes

8. If things go smoothly check the model on other datasets with different inter-class objects.
    E.g., different triangles

9. Check the effect of pretraining using available datasets
10. Read about one data sample transfer learning 
"""
Out[1]:
'\n*1. import the image with a red rectangle to get the object  \n*2. import other png images to use that as a background\n*3. Crop the region of interest from the img with the target object\n*4. Write a dataloader that pastes the object of interest on a random background image\n*5. Add very many transformations to this dataloader - COULD ADD MANY MORE\n*6. Write up a neural net class for the detection \n*7. Train the neural net and see how the training goes\n\n8. If things go smoothly check the model on other datasets with different inter-class objects.\n    E.g., different triangles\n\n9. Check the effect of pretraining using available datasets\n10. Read about one data sample transfer learning \n'
In [2]:
import os
import time

import numpy as np
import cv2
import matplotlib.pyplot as plt

import torch
import torchvision
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from tensorboardX import SummaryWriter
from torchsummary import summary
In [8]:
# Dataset directory layout: backgrounds to paste onto, the red-marked
# target image, and held-out test documents.
backgrounds_dir = './dataset/backgrounds/'
target_dir = './dataset/target/'
test_dir = './dataset/test/'
In [9]:
# see how the target looks like, and locate the red marker rectangle
img = cv2.imread(target_dir+'target_shape_marked.png')
print('image_shape: ', img.shape)

img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

#plt the original image
plt.figure(figsize=(16,16))
plt.subplot(121), plt.imshow(img)

# Find red pixels with a vectorized mask instead of the original O(H*W)
# per-pixel Python loop (which took seconds per image).
red_mask = (img[:, :, 0] > 100) & (img[:, :, 1] < 100) & (img[:, :, 2] < 100)
red_coords = np.argwhere(red_mask)  # (row, col) pairs in row-major order
assert red_coords.size > 0, 'no red marker pixels found in the target image'

# First red pixel in row-major order == upper-left corner, last == lower-right
# (identical to the original scan order over rows then columns).
uleft_colrow_idx = (int(red_coords[0, 1]), int(red_coords[0, 0]))
lright_colrow_idx = (int(red_coords[-1, 1]), int(red_coords[-1, 0]))

print(uleft_colrow_idx)
print(lright_colrow_idx)
img_marked = cv2.circle(img.copy(), uleft_colrow_idx ,50, (0,255,0), 10)
img_marked = cv2.circle(img_marked, lright_colrow_idx ,50, (0,255,0), 10)

#plot the image with added circles to check if the corner detection is fine
plt.subplot(122), plt.imshow(img_marked)
plt.show()
image_shape:  (1651, 1275, 3)
(176, 1134)
(456, 1370)
In [10]:
# Sanity-check the backgrounds: each must match the target document's size,
# otherwise the paste/resize logic below would silently distort the aspect.
fnames = os.listdir(backgrounds_dir)
for fname in fnames:
    bg_img = cv2.imread(backgrounds_dir + fname, 0)
    assert img.shape[:2] == bg_img.shape[:2], 'Check dimensions'

print('we are cool!')
we are cool!
In [11]:
# Crop the object of interest out of the detected marker rectangle.
col_ul, row_ul = uleft_colrow_idx
col_lr, row_lr = lright_colrow_idx
print(row_ul, row_lr), print(col_ul, col_lr)

# Shrink the crop by the red rectangle's line thickness so no red remains
# inside the object of interest.
eps = 5
ooi = img[row_ul + eps:row_lr - eps, col_ul + eps:col_lr - eps, :].copy()
print(ooi.shape)
plt.imshow(ooi)
plt.show()
1134 1370
176 456
(226, 270, 3)
In [12]:
# Report how large the object is relative to the whole document, then
# convert the crop to grayscale for the synthetic pipeline.
h, w, c = ooi.shape
H, W, C = img.shape
print('h/H:{:.2f}, w/W:{:.2f}'.format(h / H, w / W))

ooi_g = cv2.cvtColor(ooi, cv2.COLOR_RGB2GRAY)
plt.imshow(ooi_g, cmap='gray')
plt.show()
h/H:0.14, w/W:0.21
In [13]:
import imgaug as ia
from imgaug import augmenters as iaa
ia.seed(0)

# Single affine augmenter; fit_output=True grows the output canvas so the
# transformed shape is never clipped.
aug = iaa.Affine(
    scale={"x": (0.1, 1.5), "y": (0.1, 1.5)},
    rotate=(-90, +90),
    shear=(-10, 10),
    cval=(255),
    fit_output=True)

# Nine copies of the same crop -> a 3x3 grid of augmentation samples.
img_batch = [ooi_g] * 9
augmented_batch = aug(images=img_batch)

fig = plt.figure(figsize=(16, 16))
for idx, aug_img in enumerate(augmented_batch):
    plt.subplot(3, 3, idx + 1)
    plt.imshow(aug_img, cmap='gray')

plt.show()
In [14]:
import imgaug as ia
from imgaug import augmenters as iaa

# Flip + affine pipeline used for the pasted target; cval=255 fills any
# exposed border with white so it blends with paper backgrounds.
seq_aug = iaa.Sequential([
    iaa.Fliplr(0.5), # horizontal flips
    
    # Apply affine transformations to each image.
    # Scale/zoom them, translate/move them, rotate them and shear them.
    iaa.Affine(
        scale={"x": (0.4, 1.1), "y": (0.4, 1.1)},
        translate_percent={"x": (-0.1, 0.1), "y": (-0.1, 0.1)},
        rotate=(-25, 25),
        shear=(-8, 8),
        cval=(255)
    )
], random_order=True) # apply augmenters in random order

augmented_batch = seq_aug(images=img_batch)

# Stack the batch horizontally for one side-by-side preview image.
augmented_batch = np.hstack(augmented_batch)
print('Augmented_batch_shape: ',augmented_batch.shape)
ia.imshow(augmented_batch)
Augmented_batch_shape:  (226, 2430)
In [15]:
bg_img = cv2.imread(backgrounds_dir+fnames[0],0)
t_img = ooi_g.copy()

print('bg_img_shape: ', bg_img.shape)
print('crop_img_shape: ', ooi_g.shape)

SIZE = 256

#background and target specs
bg_H, bg_W = bg_img.shape
h, w = t_img.shape

# target size after applying the same scale factors as the background resize
t_dim = (int(SIZE/bg_W*w),int(SIZE/bg_H*h))

# BUG FIX: cv2.resize's third positional argument is `dst`, not the
# interpolation flag, so the original flags were silently ignored and the
# default (bilinear) was used. Pass interpolation by keyword.
bg_img = cv2.resize(bg_img, (SIZE, SIZE), interpolation=cv2.INTER_AREA)
t_img = cv2.resize(t_img, t_dim, interpolation=cv2.INTER_LANCZOS4)

print('bg_img_shape: ', bg_img.shape)
print('crop_img_shape: ',t_img.shape)

plt.figure(figsize=(16,16))
plt.subplot(121),plt.imshow(bg_img, cmap='gray')

plt.subplot(122), plt.imshow(t_img, cmap='gray')
plt.plot(70,50,'or')
plt.xlim([0,SIZE])
plt.ylim([0,SIZE])
plt.show()
bg_img_shape:  (1651, 1275)
crop_img_shape:  (226, 270)
bg_img_shape:  (256, 256)
crop_img_shape:  (35, 54)
In [18]:
#do transformations
seq_aug = iaa.Sequential([
    iaa.Fliplr(0.5), # horizontal flips
    iaa.Affine(
        scale={"x": (0.9, 1.1), "y": (0.9, 1.1)},
        translate_percent={"x": (-0.05, 0.05), "y": (-0.05, 0.05)},
        rotate=(-90, 90),
        shear=(-16, 16),
        cval=(255),
        fit_output=False
    )
], random_order=True) # apply augmenters in random order

augmented_t = seq_aug(image = t_img)
plt.figure(figsize=(16,6))
plt.subplot(121),plt.imshow(t_img, cmap = 'gray')
plt.subplot(122), plt.imshow(augmented_t, cmap = 'gray')
plt.show()
print(augmented_t.shape)

#paste on top of the document
#get resized specs
H_pix, W_pix = bg_img.shape
h_pix,w_pix = augmented_t.shape

# sample the upper-left corner so the patch stays fully inside the background
ulcol_idx = np.random.randint(0,(W_pix-w_pix))
ulrow_idx = np.random.randint(0,(H_pix-h_pix))

# center of the pasted patch -> responsible label-grid cell
crow_idx = ulrow_idx + h_pix//2
ccol_idx = ulcol_idx + w_pix//2

STRIDE = 32
target = np.zeros((SIZE//STRIDE, SIZE//STRIDE), dtype = np.uint8)
# BUG FIX: use STRIDE instead of the magic constant 32 so the label grid
# stays consistent if STRIDE is ever changed.
target[crow_idx//STRIDE, ccol_idx//STRIDE] = 1
#add target on top the background
fused_img = bg_img.copy()
fused_img[ulrow_idx:ulrow_idx+h_pix,ulcol_idx:ulcol_idx+w_pix] = augmented_t

plt.figure(figsize=(16,16))
plt.subplot(121),plt.imshow(fused_img, cmap='gray')
plt.subplot(122),plt.imshow(target, cmap='gray')
plt.show()
(35, 54)
In [21]:
STRIDE = 32
N_max = 9
N = np.random.randint(1,N_max+1)
print('N:', N)
target = np.zeros((SIZE//STRIDE, SIZE//STRIDE), dtype = np.uint8)

# occupancy canvas so pasted objects never overlap each other
bg_canvas = np.zeros_like(bg_img)

fused_img = bg_img.copy()

for i in range(N):
    #we can add resize here
    augmented_t = seq_aug(image = t_img)

    #get resized specs
    H_pix, W_pix = bg_img.shape
    h_pix, w_pix = augmented_t.shape

    # Rejection-sample a free location. BUG FIX: the original `while` loop
    # had no exit guarantee and could spin forever on a crowded canvas, so
    # cap the number of attempts and stop placing objects when it is reached.
    placed = False
    for _ in range(100):
        #sample upper left col and row idxs
        ulcol_idx = np.random.randint(0,(W_pix-w_pix))
        ulrow_idx = np.random.randint(0,(H_pix-h_pix))

        dummy_canvas = np.zeros_like(bg_canvas)
        dummy_canvas[ulrow_idx:ulrow_idx+h_pix,ulcol_idx:ulcol_idx+w_pix] = 1
        if (dummy_canvas*bg_canvas).sum() == 0:
            placed = True
            break
    if not placed:
        break  # no overlap-free spot found

    #get the idxs for the center pixel
    crow_idx = ulrow_idx + h_pix//2
    ccol_idx = ulcol_idx + w_pix//2

    #add target on top the background
    fused_img[ulrow_idx:ulrow_idx+h_pix,ulcol_idx:ulcol_idx+w_pix] = augmented_t

    #create the label matrix
    target[crow_idx//STRIDE,ccol_idx//STRIDE] = 1

    #add changes to the background canvas
    bg_canvas[ulrow_idx:ulrow_idx+h_pix,ulcol_idx:ulcol_idx+w_pix] = 1

plt.figure(figsize=(16,16))
plt.subplot(121),plt.imshow(fused_img)
plt.subplot(122),plt.imshow(target)
plt.show()
N: 2
In [24]:
import imgaug as ia
from imgaug import augmenters as iaa


class OneShot_Dataset(Dataset):
    """Synthesizes detection samples by pasting augmented copies of the
    target shape onto random background documents.

    __getitem__ returns:
        input_tensor  -- (i_size, i_size) float tensor, grayscale fused image
        output_tensor -- (i_size//stride, i_size//stride) float tensor with a
                         1 at every grid cell containing an object center
    """

    def __init__(self, background_dir = './dataset/backgrounds/', target_dir= './dataset/target/',
                target_fname = 'target_shape_marked.png'):

        self.background_dir = background_dir
        self.target_dir = target_dir

        self.background_fnames = os.listdir(self.background_dir)
        self.target_fname = target_fname

        # one sample per background file
        self.idx_range = int(len(self.background_fnames))
        self.target_image = self.crop_the_target_shape()
        self.augmentation_function = self.sequential_augmentation_function()

    def __len__(self):
        return int(self.idx_range)

    def __getitem__(self, index):

        #read a background (grayscale)
        bg_path = os.path.join(self.background_dir, self.background_fnames[index])
        bg_img = cv2.imread(bg_path,0)

        image_npa, label_npa = self.create_io_arrays(bg_img)

        input_tensor = torch.from_numpy(image_npa).float()
        output_tensor = torch.from_numpy(label_npa).float()

        return input_tensor, output_tensor

    def crop_the_target_shape(self):
        """Locate the red marker rectangle in the target image and return the
        grayscale crop of the object inside it."""
        # BUG FIX: use the instance's configured dir/filename instead of the
        # notebook-level globals the original silently depended on.
        img_path = os.path.join(self.target_dir, self.target_fname)
        img = cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB)
        print('document_image_shape: ', img.shape)

        # Vectorized red-pixel search (the original per-pixel Python loop was
        # very slow). First/last red pixel in row-major order give the
        # upper-left / lower-right rectangle corners, matching the old scan.
        red_mask = (img[:, :, 0] > 100) & (img[:, :, 1] < 100) & (img[:, :, 2] < 100)
        red_coords = np.argwhere(red_mask)
        assert red_coords.size > 0, 'no red marker pixels found in ' + img_path
        uleft_colrow_idx = (int(red_coords[0, 1]), int(red_coords[0, 0]))
        lright_colrow_idx = (int(red_coords[-1, 1]), int(red_coords[-1, 0]))

        img_marked = cv2.circle(img.copy(), uleft_colrow_idx ,50, (0,255,0), 10)
        img_marked = cv2.circle(img_marked, lright_colrow_idx ,50, (0,255,0), 10)

        #plot the the original image and the image with added circles
        fig = plt.figure(figsize=(16,16))
        plt.subplot(121), plt.imshow(img)
        plt.subplot(122), plt.imshow(img_marked)
        plt.show()

        # BUG FIX: the original indexed with notebook globals (row_ul, col_ul,
        # row_lr, col_lr) that were never defined inside this method; unpack
        # the locally detected corners instead.
        col_ul, row_ul = uleft_colrow_idx
        col_lr, row_lr = lright_colrow_idx

        #object of interest
        eps = 5 #red rectangle line thickness
        ooi = img[row_ul+eps:row_lr-eps,col_ul+eps:col_lr-eps,:].copy()

        #convert to gray
        ooi_g = cv2.cvtColor(ooi, cv2.COLOR_RGB2GRAY)

        h,w,c = ooi.shape
        H,W,C = img.shape

        print('object_of_interest_shape: ', ooi.shape)
        print('h/H:{:.2f}, w/W:{:.2f}'.format(h/H, w/W))

        return ooi_g

    def sequential_augmentation_function(self):
        """Build the augmentation pipeline applied to each pasted target."""
        seq_aug = iaa.Sequential([
            iaa.Fliplr(0.5), # horizontal flips

            # Scale/zoom, translate/move, rotate and shear the crop;
            # cval=255 fills exposed borders with white (paper background).
            iaa.Affine(
                scale={"x": (0.4, 1.1), "y": (0.4, 1.1)},
                translate_percent={"x": (-0.1, 0.1), "y": (-0.1, 0.1)},
                rotate=(-25, 25),
                shear=(-8, 8),
                cval=(255)
            )
        ], random_order=True) # apply augmenters in random order

        return seq_aug

    def create_io_arrays(self, bg_img, N_max=4, i_size=256, stride = 32):
        """Paste 1..N_max augmented targets on the resized background and
        build the matching (i_size//stride)^2 objectness label grid."""
        #background and target specs
        bg_H, bg_W = bg_img.shape
        h, w = self.target_image.shape

        # target dims scaled by the same factors as the background
        t_dim = (int(i_size/bg_W*w),int(i_size/bg_H*h))

        # BUG FIX: pass the interpolation flag by keyword -- the third
        # positional argument of cv2.resize is `dst`, so the original flags
        # were silently ignored.
        bg_img = cv2.resize(bg_img, (i_size, i_size), interpolation=cv2.INTER_LANCZOS4)
        t_img = cv2.resize(self.target_image, t_dim, interpolation=cv2.INTER_LANCZOS4)

        #sample number of objects to dist on the bg
        N = np.random.randint(1,N_max+1)

        #initialize the label grid
        target_tensor = np.zeros((i_size//stride, i_size//stride), dtype = np.uint8)

        #canvas to not to put objects on top of eachother
        bg_canvas = np.zeros_like(bg_img)

        fused_img = bg_img.copy()

        for i in range(N):

            augmented_t = self.augmentation_function(image = t_img)

            #get resized specs
            H_pix, W_pix = bg_img.shape
            h_pix, w_pix = augmented_t.shape

            # Rejection-sample an overlap-free location. BUG FIX: the original
            # `while` loop had no exit guarantee and could hang the dataloader
            # on a crowded canvas; cap the attempts and stop placing instead.
            placed = False
            for _ in range(100):
                #sample upper left col and row idxs
                ulcol_idx = np.random.randint(0,(W_pix-w_pix))
                ulrow_idx = np.random.randint(0,(H_pix-h_pix))

                dummy_canvas = np.zeros_like(bg_canvas)
                dummy_canvas[ulrow_idx:ulrow_idx+h_pix,ulcol_idx:ulcol_idx+w_pix] = 1
                if (dummy_canvas*bg_canvas).sum() == 0:
                    placed = True
                    break
            if not placed:
                break  # no free spot; stop adding objects

            #get the idxs for the center pixel
            crow_idx = ulrow_idx + h_pix//2
            ccol_idx = ulcol_idx + w_pix//2

            #add target on top the background
            fused_img[ulrow_idx:ulrow_idx+h_pix,ulcol_idx:ulcol_idx+w_pix] = augmented_t

            # BUG FIX: use the `stride` parameter -- the original indexed with
            # the notebook-global STRIDE, breaking any non-default stride.
            target_tensor[crow_idx//stride, ccol_idx//stride] = 1

            #add changes to the background canvas
            bg_canvas[ulrow_idx:ulrow_idx+h_pix,ulcol_idx:ulcol_idx+w_pix] = 1

        return fused_img, target_tensor


    
In [25]:
# Instantiate the synthetic dataset; its length equals the number of
# background files found on disk.
triangle_dataset = OneShot_Dataset()
print('Dataset_length: ',len(triangle_dataset))
document_image_shape:  (1651, 1275, 3)
object_of_interest_shape:  (226, 270, 3)
h/H:0.14, w/W:0.21
Dataset_length:  162
In [26]:
#test 1-2
# Pull one synthetic sample and show the fused 256x256 image next to its
# 8x8 objectness label grid.
input_tensor, output_tensor = triangle_dataset[0]
print(input_tensor.shape, output_tensor.shape)
input_npa = input_tensor.numpy()
output_npa = output_tensor.numpy()
plt.figure(figsize=(16,8))
plt.subplot(121), plt.imshow(input_npa)
plt.subplot(122), plt.imshow(output_npa)
plt.show()
torch.Size([256, 256]) torch.Size([8, 8])
In [27]:
#define the neural network class
class Mark_1(nn.Module):
    """Tiny fully-convolutional detector.

    Maps a (N, 1, 256, 256) grayscale image to a (N, 1, 8, 8) objectness
    grid: five stride-2 max-pools give an overall stride of 32. The 1x1
    head emits raw scores (no sigmoid); thresholding happens outside.
    """

    @staticmethod
    def _block(in_ch, out_ch, kernel_size, padding):
        # conv -> batchnorm -> ReLU -> 2x2 max-pool halving the resolution
        return nn.Sequential(
            nn.Conv2d(in_ch, out_ch, kernel_size=kernel_size, stride=1, padding=padding),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

    def __init__(self):
        super(Mark_1, self).__init__()
        # backbone: 3x3 convs with doubling channel width
        self.layer1 = self._block(1, 32, 3, 1)
        self.layer2 = self._block(32, 64, 3, 1)
        self.layer3 = self._block(64, 128, 3, 1)
        self.layer4 = self._block(128, 256, 3, 1)
        # 1x1 conv stage, still followed by a pool (stride 16 -> 32)
        self.layer5 = self._block(256, 512, 1, 0)
        # 1x1 head producing a single objectness channel
        self.layer6 = nn.Sequential(
            nn.Conv2d(512, 1, kernel_size=1, stride=1, padding=0),
        )

    def forward(self, image):
        out = image
        for stage in (self.layer1, self.layer2, self.layer3,
                      self.layer4, self.layer5, self.layer6):
            out = stage(out)
        # (N, 1, H/32, W/32) raw objectness scores
        return out
In [28]:
# Select GPU when available, move the model there, and print both the
# module tree and a torchsummary layer-by-layer parameter breakdown.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('device : ', device)

model = Mark_1().to(device)
print(model)
summary(model, input_size=(1,256,256))
device :  cpu
Mark_1(
  (layer1): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (layer2): Sequential(
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (layer3): Sequential(
    (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (layer4): Sequential(
    (0): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (layer5): Sequential(
    (0): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1))
    (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (layer6): Sequential(
    (0): Conv2d(512, 1, kernel_size=(1, 1), stride=(1, 1))
  )
)
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
================================================================
            Conv2d-1         [-1, 32, 256, 256]             320
       BatchNorm2d-2         [-1, 32, 256, 256]              64
              ReLU-3         [-1, 32, 256, 256]               0
         MaxPool2d-4         [-1, 32, 128, 128]               0
            Conv2d-5         [-1, 64, 128, 128]          18,496
       BatchNorm2d-6         [-1, 64, 128, 128]             128
              ReLU-7         [-1, 64, 128, 128]               0
         MaxPool2d-8           [-1, 64, 64, 64]               0
            Conv2d-9          [-1, 128, 64, 64]          73,856
      BatchNorm2d-10          [-1, 128, 64, 64]             256
             ReLU-11          [-1, 128, 64, 64]               0
        MaxPool2d-12          [-1, 128, 32, 32]               0
           Conv2d-13          [-1, 256, 32, 32]         295,168
      BatchNorm2d-14          [-1, 256, 32, 32]             512
             ReLU-15          [-1, 256, 32, 32]               0
        MaxPool2d-16          [-1, 256, 16, 16]               0
           Conv2d-17          [-1, 512, 16, 16]         131,584
      BatchNorm2d-18          [-1, 512, 16, 16]           1,024
             ReLU-19          [-1, 512, 16, 16]               0
        MaxPool2d-20            [-1, 512, 8, 8]               0
           Conv2d-21              [-1, 1, 8, 8]             513
================================================================
Total params: 521,921
Trainable params: 521,921
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.25
Forward/backward pass size (MB): 100.75
Params size (MB): 1.99
Estimated Total Size (MB): 102.99
----------------------------------------------------------------
In [29]:
# create dataloader
# NOTE(review): batch_size is hard-coded to 8 here while the hyper-parameter
# cell below defines a separate batch_size=32 -- confirm which is intended.
train_loader = torch.utils.data.DataLoader(dataset=triangle_dataset,
                                            batch_size=8,
                                            shuffle=True) 
In [30]:
# Training hyper-parameters.
learning_rate = 1e-3
batch_size = 32  # NOTE(review): unused -- the DataLoader above was built with batch_size=8
num_epochs = 30

# YOLO-style down-weight for the loss on grid cells without an object
lmda_noobj = 0.5

criterion_MSE = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)
In [31]:
# Loss history and epoch counter kept in a separate cell so re-running the
# training cell appends to (rather than resets) these.
loss_list=[]
epoch_iteration_no=0
In [32]:
#training loop
# YOLO-style objectness training: MSE on grid cells that contain an object
# center, plus a down-weighted (lmda_noobj) MSE on the empty cells.
model.train()
for epoch in range(num_epochs):
    start_t = time.time()
    for i, (images,labels) in enumerate(train_loader):
        
        
        images = images.view(-1,1,256,256).float().to(device)
        labels = labels.view(-1,1,8,8).float().to(device)
#         print('images_shape: ',images.shape)
#         print('labels_shape: ',labels.shape)
        
        targets_flat = labels[:,0,:,:].flatten().to(device)
        
        if i == 0 and epoch ==0 :
            print('i==0,epoch ==0 ',images.shape,labels.shape)
        
        #forward pass
        outputs = model(images)
#         print('outputs_shape: ',outputs.shape)

        #disect the output
        outputs_flat = outputs[:,0,:,:].flatten()
        
        #mask the grid cells that has obj cm
        # NOTE(review): relies on every sample containing >=1 object cell
        # (the dataset pastes N>=1 objects); an empty mask would make the
        # masked MSE below evaluate to nan.
        mask_obj = targets_flat>0
        mask_noobj = targets_flat==0
        
        #objectness loss
        obj_loss_obj = criterion_MSE(outputs_flat[mask_obj],targets_flat[mask_obj])
        obj_loss_noobj = criterion_MSE(outputs_flat[mask_noobj],targets_flat[mask_noobj])
        
        loss = obj_loss_obj + lmda_noobj*obj_loss_noobj
        loss_list.append(loss.item())
        
        #backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    end_t = time.time()
    print('Epoch [{}/{}], loss: {:.12f}, time:{}'.format(epoch+1,num_epochs,loss.item(),end_t-start_t))
    print()                
    
#     if (epoch+1)%save_ckpt_stepn == 0:
#         checkp_name = out_dir+'{}_epoch{}'.format(detector_name,(epoch+1)+(epoch_iteration_no)*(num_epochs))
#         print('saving model ckpt... \n',checkp_name)
#         torch.save(model.state_dict(),checkp_name)
        
            
# bump so a later checkpoint-saving run would not overwrite earlier epochs
epoch_iteration_no +=1
i==0,epoch ==0  torch.Size([8, 1, 256, 256]) torch.Size([8, 1, 8, 8])
Epoch [1/30], loss: 0.264348655939, time:15.785492897033691

Epoch [2/30], loss: 0.082547165453, time:15.30480432510376

Epoch [3/30], loss: 0.120148405433, time:14.352236270904541

Epoch [4/30], loss: 0.034708395600, time:14.174126625061035

Epoch [5/30], loss: 0.130309730768, time:14.9504075050354

Epoch [6/30], loss: 0.050018832088, time:14.885019063949585

Epoch [7/30], loss: 0.029306024313, time:15.636901378631592

Epoch [8/30], loss: 0.036454901099, time:14.706599950790405

Epoch [9/30], loss: 0.066177204251, time:14.275766134262085

Epoch [10/30], loss: 0.032805752009, time:14.484607934951782

Epoch [11/30], loss: 0.061860725284, time:14.591905117034912

Epoch [12/30], loss: 0.076783090830, time:14.324437618255615

Epoch [13/30], loss: 0.017738243565, time:14.743742227554321

Epoch [14/30], loss: 0.029222091660, time:14.688948154449463

Epoch [15/30], loss: 0.083028420806, time:14.710556030273438

Epoch [16/30], loss: 0.017876584083, time:15.495397806167603

Epoch [17/30], loss: 0.030922397971, time:15.668429851531982

Epoch [18/30], loss: 0.031425751746, time:15.602123260498047

Epoch [19/30], loss: 0.037137225270, time:16.56112551689148

Epoch [20/30], loss: 0.209007501602, time:14.721665382385254

Epoch [21/30], loss: 0.061413433403, time:16.252472400665283

Epoch [22/30], loss: 0.031725440174, time:15.144721508026123

Epoch [23/30], loss: 0.067328996956, time:15.474965333938599

Epoch [24/30], loss: 0.040740057826, time:15.121220111846924

Epoch [25/30], loss: 0.064443685114, time:15.150021314620972

Epoch [26/30], loss: 0.030545638874, time:14.821189403533936

Epoch [27/30], loss: 0.020826719701, time:18.61385989189148

Epoch [28/30], loss: 1.355314731598, time:16.79782724380493

Epoch [29/30], loss: 0.074226364493, time:15.371830940246582

Epoch [30/30], loss: 0.049488272518, time:14.173985958099365

In [33]:
# Per-iteration training loss curve.
plt.plot(loss_list[:])
Out[33]:
[<matplotlib.lines.Line2D at 0x7fefef5d1c10>]
In [34]:
#Test the model w/ all testing data
# A fresh dataset instance resamples augmentations and placements, so this
# acts as a validation set drawn from the same synthetic distribution.
dataset = OneShot_Dataset()
data_loader = torch.utils.data.DataLoader(dataset=dataset,
                                            batch_size=1,
                                            shuffle=False) 
document_image_shape:  (1651, 1275, 3)
object_of_interest_shape:  (226, 270, 3)
h/H:0.14, w/W:0.21
In [35]:
# Collect inputs, labels, and raw model outputs for every validation sample.
images_npa = np.zeros((len(data_loader),256,256))
labels_npa = np.zeros((len(data_loader),8,8))
outputs_npa = np.zeros((len(data_loader),8,8))

model.eval() #eval mode (batchnorm uses moving mean/variance instead of mini-batch mean/variance)
with torch.no_grad():
    for idx, (images,labels) in enumerate(data_loader):
        images = images.view(-1,1,256,256).float().to(device)
        targets = labels.view(-1,1,8,8).float().to(device)
        out = model(images)
        # BUG FIX: move tensors to CPU before .numpy() so this cell also
        # works when `device` is a GPU (.numpy() raises on CUDA tensors).
        images_npa[idx] = images.squeeze().detach().cpu().numpy()
        labels_npa[idx] = targets.squeeze().detach().cpu().numpy()
        outputs_npa[idx] = out.squeeze().detach().cpu().numpy()



print('images_npa: ',images_npa.shape)
print('labels_npa: ',labels_npa.shape)
print('outputs_npa: ',outputs_npa.shape)
images_npa:  (162, 256, 256)
labels_npa:  (162, 8, 8)
outputs_npa:  (162, 8, 8)
In [36]:
# Visualize one validation sample: input image, ground-truth grid, and the
# raw (unthresholded) model prediction.
idx= 4

row=1
col=3

plt.figure(figsize=(16,8))
plt.subplot(row,col,1), plt.imshow(images_npa[idx])
plt.subplot(row,col,2), plt.imshow(labels_npa[idx])
plt.subplot(row,col,3), plt.imshow(outputs_npa[idx])
plt.plot()
Out[36]:
[]
In [41]:
# how about testset?
fnames = os.listdir(test_dir)
ex_img = cv2.imread(test_dir+fnames[0],0)
H,W = ex_img.shape

print('H,W: ', ex_img.shape)
print('Number of test examples: ', len(fnames))

#create empty arrays for original-size and network-size test images
test_images = np.zeros((len(fnames),H,W))
test_images_resized = np.zeros((len(fnames),256,256))

#read and resize all the test files
for idx, fname in enumerate(fnames):
    img = cv2.imread(test_dir+fname,0)
    # BUG FIX: pass the interpolation flag by keyword -- positionally it
    # lands on cv2.resize's `dst` argument and is silently ignored.
    img_resized = cv2.resize(img, (256,256), interpolation=cv2.INTER_LANCZOS4)

    test_images[idx] = img
    test_images_resized[idx] = img_resized

#create the torch tensor to feed
test_tensor = torch.from_numpy(test_images_resized)
print(test_tensor.shape)
H,W:  (1651, 1275)
Number of test examples:  14
torch.Size([14, 256, 256])
In [46]:
# run the model on the resized test document images
model.eval()
with torch.no_grad():
    # BUG FIX: move the input to the model's device -- the original fed a
    # CPU tensor, which fails when the model was moved to a GPU.
    test_outputs = model(test_tensor.view(-1,1,256,256).float().to(device)).squeeze()

# BUG FIX: .cpu() before .numpy() so this also works for CUDA tensors.
test_outputs = test_outputs.cpu().numpy()
In [47]:
# Threshold the outputs
# Zero out weak responses; 0.3 is a hand-picked cutoff on the raw scores.
test_outputs[test_outputs<0.3]=0
In [48]:
# Show every resized test document next to its thresholded 8x8 output grid.
row=14
col=2
fig = plt.figure(figsize=(16,128))
for i in range(len(fnames)):
    plt.subplot(row,col,2*i+1), plt.imshow(test_images_resized[i])
    plt.subplot(row,col,2*i+2), plt.imshow(test_outputs[i])
    
plt.show()
In [49]:
# Upsample each 8x8 output grid back to the original document resolution.
test_outputs_resized = np.zeros_like(test_images)
# BUG FIX: iterate over every output (the hard-coded range(11) skipped the
# last test images) and pass the interpolation flag by keyword (positionally
# it lands on cv2.resize's `dst` argument and is ignored).
for i in range(len(test_outputs)):
    test_outputs_resized[i] = cv2.resize(test_outputs[i], (W,H), interpolation=cv2.INTER_AREA)
plt.imshow(test_outputs_resized[1])
test_outputs_resized.max()
Out[49]:
0.6169332265853882
In [50]:
# Blend each original document with its upsampled detection heatmap.
final_heatmaps = np.zeros_like(test_images)
# BUG FIX: cover every test image instead of the hard-coded range(11),
# which left the last heatmaps all-zero.
for i in range(len(test_images)):
    final_heatmaps[i] = cv2.addWeighted(test_images[i],0.1,test_outputs_resized[i]*255,0.9,0)
In [51]:
# Final figure: hand-selected test documents next to their heatmap overlays.
selected_idxs = [0,1,2,3,5,7,8,9,10]
row=len(selected_idxs)
col=2
fig = plt.figure(figsize=(16,80))
subplot_counter = 1
for i in selected_idxs:
    
    fig.add_subplot(row,col,subplot_counter) 
    plt.imshow(test_images[i],cmap='gray') 
    subplot_counter+=1
    
    fig.add_subplot(row,col,subplot_counter)
    plt.imshow(final_heatmaps[i],cmap='gray')
    subplot_counter+=1
    
plt.show()
In [ ]: